package bswd;

import java.io.BufferedWriter;

import java.io.File;
import java.io.FileWriter;
import java.util.List;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Collects the title, publication time and body of news articles by opening
 * the index page under {@code DOMAIN}, following the links found in its news
 * table (at most {@code LIMIT} of them) and appending the extracted text to a
 * local text file.
 *
 * <p>The pages are located purely by position: the news list and the article
 * parts are picked out of the page's {@code <table>} elements by fixed index
 * (see the {@code *_TABLE_INDEX} constants). These positions are tied to the
 * site's markup at the time of writing and must be adjusted if the layout
 * changes.
 *
 * @author Kavinyao
 */
public class HtmlUnitTest2 {
    /** Maximum number of news links that are followed. */
    private static final int LIMIT = 16;
    private static final String DOMAIN = "http://jw.nju.edu.cn";
    /** Inline script text that shows up in the article body and is stripped from the output. */
    private static final String JS = "function submit(){var  th=document.form1;th.action=\"down-file.do\";th.submit();}";

    /** Position (among all {@code <table>} elements of the index page) of the table holding the news links. */
    private static final int NEWS_LIST_TABLE_INDEX = 8;
    /** Position (among all {@code <table>} elements of an article page) of the table holding title and publication time. */
    private static final int ARTICLE_HEADER_TABLE_INDEX = 9;
    /** Position (among all {@code <table>} elements of an article page) of the table holding the article body. */
    private static final int ARTICLE_BODY_TABLE_INDEX = 10;

    /** Destination file for the extracted text. */
    private static final String OUTPUT_PATH = "D:\\test2.txt";
    /** Line separator written to the output file. */
    private static final String CRLF = "\r\n";

    /**
     * Runs the scrape. Any failure is reported on standard error; the web
     * client and the output file are released whether or not the run succeeds.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            final WebClient client = new WebClient(BrowserVersion.FIREFOX_3_6);
            try {
                client.setJavaScriptEnabled(false);
                System.out.println("Start connecting...");
                final HtmlPage indexPage = client.getPage(DOMAIN + "/root/index.html");
                System.out.println("Successfully connected, start processing...");

                // All <table> elements of the index page; one of them lists the news.
                final List<HtmlElement> indexTables = indexPage.getElementsByTagName("table");
                final HtmlElement newsTable = tableAt(indexTables, NEWS_LIST_TABLE_INDEX, "news list");
                // Every <a> inside the news table is treated as a link to one article.
                final List<HtmlElement> links = newsTable.getElementsByTagName("a");

                // NOTE(review): FileWriter encodes with the platform default charset, so the
                // encoding of the output file depends on the machine this runs on.
                final BufferedWriter writer = new BufferedWriter(new FileWriter(new File(OUTPUT_PATH)));
                try {
                    final int count = Math.min(links.size(), LIMIT);
                    for (int i = 0; i < count; i++) {
                        final HtmlElement link = links.get(i);
                        System.out.println("processing element " + (i));
                        System.out.println("element title: " + link.getTextContent());
                        writeArticle(client, link, writer);
                    }
                } finally {
                    // Always flush and release the file, even when an article fails midway.
                    writer.close();
                }
                // Reported only after the file has been closed successfully.
                System.out.println("All done...");
            } finally {
                client.closeAllWindows();
            }
        } catch (Exception e) {
            // Include the exception type: getMessage() alone is null or cryptic for many failures.
            System.err.println("Scraping failed: " + e);
        }
    }

    /**
     * Opens the article a link points to and appends its title, publication
     * time and body to the writer, each followed by a CRLF.
     *
     * @param client web client used to fetch the article page
     * @param link   anchor element whose {@code href} identifies the article
     * @param writer destination for the extracted text
     * @throws java.io.IOException   if the page cannot be fetched or the text cannot be written
     * @throws IllegalStateException if the article page does not have the expected layout
     */
    private static void writeArticle(WebClient client, HtmlElement link, BufferedWriter writer)
            throws java.io.IOException {
        // The first two characters of the href are dropped before appending it to DOMAIN
        // (presumably a leading ".." of a relative link -- verify against the live markup).
        final HtmlPage articlePage = client.getPage(DOMAIN + link.getAttribute("href").substring(2));
        System.out.println(articlePage.hashCode());
        final List<HtmlElement> tables = articlePage.getElementsByTagName("table");

        final HtmlElement header = tableAt(tables, ARTICLE_HEADER_TABLE_INDEX, "article header");
        final List<HtmlElement> headerCells = header.getElementsByTagName("td");
        if (headerCells.size() < 2) {
            throw new IllegalStateException(
                    "Expected at least 2 <td> cells in the article header table but found " + headerCells.size());
        }
        // First cell: written as-is. Second cell: written with the surrounding brackets removed.
        writer.write(headerCells.get(0).getTextContent() + CRLF);
        writer.write(headerCells.get(1).getTextContent().replace("【", "").replace("】", "") + CRLF);

        // Main body: strip the embedded script text and normalise line breaks to CRLF.
        final HtmlElement body = tableAt(tables, ARTICLE_BODY_TABLE_INDEX, "article body");
        final String content = body.getTextContent().replace(JS, "").replace("\n", CRLF);
        writer.write(content);
        writer.write(CRLF);
    }

    /**
     * Returns the table at the given position, failing with a descriptive
     * message instead of a bare index error when the page has fewer tables
     * than expected.
     *
     * @param tables  all {@code <table>} elements of a page
     * @param index   zero-based position of the wanted table
     * @param purpose short description of the table, used in the error message
     * @return the table at {@code index}
     * @throws IllegalStateException if {@code tables} has no element at {@code index}
     */
    private static HtmlElement tableAt(List<HtmlElement> tables, int index, String purpose) {
        if (index >= tables.size()) {
            throw new IllegalStateException("Page layout changed: expected the " + purpose
                    + " table at index " + index + " but the page has only " + tables.size() + " tables");
        }
        return tables.get(index);
    }
}
